suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Root directory of the analysis; every other path in this script is built by
# pasting a relative path onto it, so the trailing slash is required.
wd <- '~/Google Drive/My Drive/Analysis/METTL2A/'

# Output directories for the figures and tables written further down.
figdir <- paste0(wd, 'Figures/DRS_diffthresh/')
tabledir <- paste0(wd, 'Tables/DRS_diffthresh/')

Functions

# Prefix a path relative to the analysis root with `wd`.
#
# path: character vector of paths relative to `wd`.
# Returns `wd` and `path` concatenated with no separator inserted, so `wd`
# must already end in a slash.
paste_wd <- function(path) paste0(wd, path)

# Keep the sites whose current intensity is significantly increased in both
# comparisons (column suffixes `_G` and `_I`).
#
# df:   data frame with the columns KS_intensity_pvalue_{G,I},
#       c1_median_intensity_{G,I} and c2_median_intensity_{G,I}.
# pval: exclusive upper bound applied to both KS intensity p-values.
#
# Returns the rows of `df` where both p-values are below `pval` and the
# cluster-2 median intensity exceeds the cluster-1 median in both
# comparisons. Rows where any condition evaluates to NA are dropped.
#
# The input is `df` itself: the earlier version ignored this argument and
# always filtered the global `sampcomp_results_joined`, so anything piped in
# was silently discarded.
filter_intensityup_sites_pval <- function(df, pval) {
  df |> 
    filter(
      # `.env$` guarantees the function argument is used even if `df` ever
      # gains a column called `pval`.
      KS_intensity_pvalue_G < .env$pval,
      KS_intensity_pvalue_I < .env$pval,
      c2_median_intensity_G - c1_median_intensity_G > 0,
      c2_median_intensity_I - c1_median_intensity_I > 0
    )
}

# Attach to each site the transcript sequence window made of the reference
# k-mer plus `neighbor_length` flanking bases on each side.
#
# df:              data frame whose columns from `transcript_id` through
#                  `ref_kmer` include `position`.
# neighbor_length: number of flanking bases added on each side of the k-mer.
# kmer_length:     length of the reference k-mer (default 5, the value that
#                  was previously hard-coded).
#
# Returns the selected columns of `df`, the columns joined from the global
# `espresso_AsPC1_transcriptome_seqs`, and a new `seq` column.
#
# The window starts at `position + 1`, i.e. `position` is used as the 0-based
# offset of the k-mer's first base. With `neighbor_length = 0` the window is
# just the k-mer.
get_neighbor_seq <- function(df, neighbor_length = 5, kmer_length = 5) {
  df |> 
    select(transcript_id:ref_kmer) |> 
    # Explicit key: avoids the "Joining with ..." message and prevents an
    # accidental join on any other column the two tables may come to share.
    left_join(espresso_AsPC1_transcriptome_seqs, by = join_by(transcript_id)) |> 
    mutate(
      seq = str_sub(
        transcript_seq, 
        # str_sub() counts negative indices from the end of the string, so a
        # site closer than `neighbor_length` to the 5' end would otherwise
        # yield an empty or wrong window. Clamping truncates the window at the
        # transcript start, matching what already happens at the 3' end.
        pmax(position + 1 - neighbor_length, 1), 
        position + kmer_length + neighbor_length
      )
    ) 
}

Read data

# Per-position sampcomp results table (tab-separated, gzip-compressed),
# located relative to the analysis root.
sampcomp_results_joined <- read_tsv(
  paste_wd('Tables/DRS/Positions/sampcomp_results_joined_2024-04-09.tsv.gz')
)
## Rows: 5884004 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (34): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (33): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the loaded table.
sampcomp_results_joined
## # A tibble: 5,884,004 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000264926.7 RAD18-201           1464 TCACA                    NA
##  2 ENST00000264926.7 RAD18-201           1465 CACAT                     1
##  3 ENST00000264926.7 RAD18-201           1466 ACATA                    NA
##  4 ENST00000264926.7 RAD18-201           1467 CATAA                     1
##  5 ENST00000264926.7 RAD18-201           1468 ATAAA                    NA
##  6 ENST00000264926.7 RAD18-201           1473 AACGA                     1
##  7 ENST00000264926.7 RAD18-201           1475 CGATC                    NA
##  8 ENST00000264926.7 RAD18-201           1486 ACACA                    NA
##  9 ENST00000264926.7 RAD18-201           1501 CAAGA                     1
## 10 ENST00000264926.7 RAD18-201           1502 AAGAC                    NA
## # ℹ 5,883,994 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Transcript sequences (one row per transcript_id); used by get_neighbor_seq()
# to look up the bases surrounding each site.
espresso_AsPC1_transcriptome_seqs <- read_tsv(
  paste_wd('Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz')
)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the transcript sequence table.
espresso_AsPC1_transcriptome_seqs
## # A tibble: 36,717 × 3
##    transcript_id      transcript_seq                           transcript_length
##    <chr>              <chr>                                                <dbl>
##  1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA…               987
##  2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA…              2252
##  3 ENST00000420393.5  CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGG…               854
##  4 ENST00000698415.1  GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTC…              6597
##  5 ENST00000698416.1  CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTC…              5500
##  6 ENST00000488263.5  AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTT…              4528
##  7 ENST00000424814.5  GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACC…              2038
##  8 ENST00000231948.9  AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCA…              2187
##  9 ENST00000432408.6  GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAG…              2203
## 10 ENST00000459840.5  ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTG…               723
## # ℹ 36,707 more rows

Check

# Sanity check of the coordinate convention: with no flanking bases the
# extracted window is only the k-mer, so keep the rows where it equals
# `ref_kmer`.
filter_intensityup_sites_pval(sampcomp_results_joined, pval = .05) |> 
  get_neighbor_seq(neighbor_length = 0) |> 
  filter(seq == ref_kmer)
## Joining with `by = join_by(transcript_id)`
## # A tibble: 605 × 7
##    transcript_id     transcript_name position ref_kmer transcript_seq           
##    <chr>             <chr>              <dbl> <chr>    <chr>                    
##  1 ENST00000429711.7 RPL32-204            422 GCCCA    AGCCCTTGCGCGCCACCGTCCCTT…
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC    CTTCTCTTACCGCCATCTTGGCTC…
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT    CTTCTCTTACCGCCATCTTGGCTC…
##  4 ENST00000389680.2 MT-RNR1-201           43 ACACA    AATAGGTTTGGTCCTAGCCTTTCT…
##  5 ENST00000389680.2 MT-RNR1-201           57 CCCCG    AATAGGTTTGGTCCTAGCCTTTCT…
##  6 ENST00000389680.2 MT-RNR1-201           71 GTTCA    AATAGGTTTGGTCCTAGCCTTTCT…
##  7 ENST00000389680.2 MT-RNR1-201           73 TCACC    AATAGGTTTGGTCCTAGCCTTTCT…
##  8 ENST00000389680.2 MT-RNR1-201           75 ACCCT    AATAGGTTTGGTCCTAGCCTTTCT…
##  9 ENST00000389680.2 MT-RNR1-201           93 ATCAA    AATAGGTTTGGTCCTAGCCTTTCT…
## 10 ENST00000389680.2 MT-RNR1-201          138 GCTTA    AATAGGTTTGGTCCTAGCCTTTCT…
## # ℹ 595 more rows
## # ℹ 2 more variables: transcript_length <dbl>, seq <chr>
# Directory that receives the per-threshold TSV and FASTA exports.
fasta_dir <- paste_wd('Fasta/DRS_diffthresh/Neighbor_5/')

# P-value cut-offs at which the site selection is repeated.
pval_threshs <- c(0.001, 0.01, 0.05, 0.1)


# Export the sequence context of the increased-intensity sites selected at one
# p-value threshold.
#
# pval: p-value cut-off passed to filter_intensityup_sites_pval(); it is also
#       pasted into the output base name.
#
# Reads the globals `sampcomp_results_joined` and `fasta_dir`. Called for its
# side effects: the table is handed to export_tsv() and then to
# export_as_fasta(), both writing into `fasta_dir`.
# NOTE(review): both exporters are opaque here (presumably from myUtilities);
# the chain relies on export_tsv() passing the data frame through - confirm.
export_neighbor_seq_increased_currentintensity_sites_pval <- function(pval) {
  
  fasta_basename <- paste0('increased_currentintensity_sites_', pval)
  sampcomp_results_joined |> 
    filter_intensityup_sites_pval(pval = pval) |> 
    # k-mer plus 5 flanking bases on each side.
    get_neighbor_seq(neighbor_length = 5) |> 
    select(transcript_id:position, seq) |> 
    # Record identifier of the form "<transcript_id>|<position>".
    mutate(name = paste0(transcript_id, '|', position)) |> 
    export_tsv(basename = fasta_basename, outdir = fasta_dir) |> 
    export_as_fasta(
      name = name, sequence = seq, fasta_basename = fasta_basename, 
      outdir = fasta_dir, compression = '')
  
}

# Run the export once per threshold; only the side effects matter.
walk(pval_threshs, export_neighbor_seq_increased_currentintensity_sites_pval)
## Joining with `by = join_by(transcript_id)`
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.001_2025-07-19.tsv
## 
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.001.fa
## Joining with `by = join_by(transcript_id)`
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.01_2025-07-19.tsv
## 
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.01.fa
## Joining with `by = join_by(transcript_id)`
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.05_2025-07-19.tsv
## 
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.05.fa
## Joining with `by = join_by(transcript_id)`
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.1_2025-07-19.tsv
## 
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS_diffthresh/Neighbor_5/increased_currentintensity_sites_0.1.fa

Check CC motifs

# Count the increased-intensity sites selected at `pval` within each
# `middleC_info` category and express the counts as percentages.
#
# pval: p-value cut-off passed to filter_intensityup_sites_pval().
#
# Reads the global `sampcomp_results_joined`. Returns one row per
# `middleC_info` value with the columns `pval` (the label "pval < <pval>"),
# `middleC_info`, `n` and `percentage` (share of all selected sites, 0-100).
calc_percentage_groupedby_middle_consecutiveC <- function(pval) {
  pval_label <- paste0('pval < ', pval)
  category_counts <- 
    filter_intensityup_sites_pval(sampcomp_results_joined, pval = pval) |> 
    count(middleC_info, name = 'n')
  
  category_counts |> 
    mutate(percentage = 100 * n / sum(n), pval = pval_label) |> 
    relocate(pval)
}

# Stack the per-threshold percentage tables into one long table.
percentages_middle_consecutiveCs <- bind_rows(
  map(pval_threshs, calc_percentage_groupedby_middle_consecutiveC)
)
# Write the combined table to the table output directory.
percentages_middle_consecutiveCs |> export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_diffthresh/percentages_middle_consecutiveCs_2025-07-19.tsv
## # A tibble: 16 × 4
##    pval         middleC_info     n percentage
##    <chr>        <chr>        <int>      <dbl>
##  1 pval < 0.001 C1              17       8.25
##  2 pval < 0.001 C2              67      32.5 
##  3 pval < 0.001 C3              75      36.4 
##  4 pval < 0.001 others          47      22.8 
##  5 pval < 0.01  C1              28       7.55
##  6 pval < 0.01  C2             138      37.2 
##  7 pval < 0.01  C3             129      34.8 
##  8 pval < 0.01  others          76      20.5 
##  9 pval < 0.05  C1              46       7.60
## 10 pval < 0.05  C2             247      40.8 
## 11 pval < 0.05  C3             196      32.4 
## 12 pval < 0.05  others         116      19.2 
## 13 pval < 0.1   C1              59       7.68
## 14 pval < 0.1   C2             309      40.2 
## 15 pval < 0.1   C3             237      30.9 
## 16 pval < 0.1   others         163      21.2
# Horizontal stacked bar chart: one bar per p-value threshold, split by
# `middleC_info` category, bar length = percentage of selected sites.
percentages_middle_consecutiveCs_barplot <- 
  percentages_middle_consecutiveCs |> 
  # Fix the category order; it is reversed again for the fill scale below.
  mutate(middleC_info = factor(middleC_info, levels = c('C3', 'C2', 'C1', 'others'))) |> 
  ggplot(aes(
    x = pval |> as.character(), y = percentage, 
    fill = middleC_info |> fct_rev())) +
  # geom_col() is the stat = 'identity' bar geom. The outline width is set
  # with `linewidth`: `size` for lines is deprecated since ggplot2 3.4.0.
  geom_col(colour = 'gray20', linewidth = 0.4) +
  scale_y_continuous(breaks = seq(0, 100, 20)) +
  # Colours follow the reversed level order: others, C1, C2, C3.
  scale_fill_manual(
    values = c('#BEBEBE', '#c5c5fb', '#7777F5', '#3131c1')
  ) +
  coord_flip()  
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Save the bar chart into the figure output directory.
# NOTE(review): ggsave_pdf() is not defined in this file (presumably from
# myUtilities); the output file name and the units of width/height are
# determined there - confirm.
percentages_middle_consecutiveCs_barplot |> 
  ggsave_pdf(
    width = 8, height = 4, outdir = figdir
  )